# `from __future__` imports must be the first statement of a module; placed
# anywhere else they are a SyntaxError when this notebook is run as a script.
from __future__ import division

import sys

# `display`, `HTML` and `IFrame` are all taken from the public IPython.display
# module; importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML, IFrame
import numpy as np
import pandas as pd
import xgboost as xgb

# Inject a CSS rule that widens the notebook's `.container` element to 96%.
display(HTML("""<style> .container {width:96% !important;}</style>"""))

# Put the parent directory first on the import path so `utils` can be imported.
sys.path.insert(0,'../')
# NOTE(review): the star import presumably supplies `fs` and `path_SBA`, which
# are used below without any other visible definition -- confirm in utils/paths.
from utils.paths import *

import pickle
# Unpickle the stored objects found under path_SBA.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point path_SBA at files from a trusted source.
with fs.open(path_SBA + 'clf_xgb.dat', 'rb') as classifier_file:
    clf_xgb = pickle.load(classifier_file)
with fs.open(path_SBA + 'bst_ex.dat', 'rb') as booster_file:
    bst_ex = pickle.load(booster_file)
with fs.open(path_SBA + 'dict_categorical.pkl', 'rb') as categorical_file:
    dict_categorical = pickle.load(categorical_file)

# Semicolon-separated CSV tables; low_memory=False makes pandas read each file
# in one pass instead of in chunks.
result_table_proj = pd.read_csv(
    path_SBA + 'result_table_proj.csv',
    sep=';',
    low_memory=False
)
proj_bas = pd.read_csv(
    path_SBA + 'proj_bas.csv',
    sep=';',
    low_memory=False
)
nat5 = pd.read_csv(
    path_SBA + 'nat5.csv',
    sep=';',
    low_memory=False
)
import eli5
# Feature weights (similar to a feature-importance table)
# Render eli5's weight table for the booster.
# NOTE(review): eli5 documents `vec` as a vectorizer; confirm that the
# unpickled `dict_categorical` object is a suitable value here.
eli5.show_weights(bst_ex, vec = dict_categorical, importance_type="weight")
# Preview the first rows of the results table and of the table whose rows are
# passed to the explainer below.
result_table_proj.head()
proj_bas.head()
# Examples of good companies: 5 rows sampled reproducibly (random_state=1)
# from those with default == 0 and Grade == 1
result_table_proj[(result_table_proj.default == 0) & (result_table_proj.Grade == 1)].sample(5, random_state=1)
# Examples of bad companies: 5 rows sampled reproducibly (random_state=1)
# from those with default == 1 and Grade == 5
result_table_proj[(result_table_proj.default == 1) & (result_table_proj.Grade == 5)].sample(5, random_state=1)
# Example of a good company: per-feature explanation for row label 55707
eli5.show_prediction(bst_ex, proj_bas.loc[55707], show_feature_values=True)
# Example of a bad company: per-feature explanation for row label 25177
eli5.show_prediction(bst_ex, proj_bas.loc[25177], show_feature_values=True)
# Selected columns of the same row label (25177) in nat5
nat5.loc[25177][['Bank', 'BankState', 'fips', 'RealEstate', 'NAICS_group']]
# Most frequent Bank and BankState values among rows with default == 1
nat5[nat5.default == 1].Bank.value_counts().head()
nat5[nat5.default == 1].BankState.value_counts().head()
# Mapping from a display group name to the feature names whose contributions
# are summed under that group.
var_group = {
    'Location': [
        'BankState_INT', 'fips_INT', 'UrbanRural', 'State_INT',
        'Zip5d_INT', 'Zip3d_INT', 'City_INT'
    ],
    'Bus. Size': [
        'RealEstate', 'NewExist', 'NoEmp', 'BusinessType_INT'
    ],
    'Bus. Status': [
        'Expanding', 'Expanding_ratio_INT', 'Retaining',
        'CreateJob', 'Retaining_ratio_INT', 'RetainedJob'
    ],
    'Bus. Sector': [
        'NAICS_group_INT', 'NAICS_default_rate', 'suffix_INT'
    ],
    'Loan quality': [
        'Bank_INT', 'LowDoc_INT', 'RevLineCr_INT'
    ],
    'Past records': [
        'Loan_age', 'Previous_loan', 'default_times'
    ]
}
def var_group_contribution(LOC, model = bst_ex, m_input = proj_bas, var_group = var_group):
    """Sum eli5's per-feature prediction weights within each variable group.

    Parameters
    ----------
    LOC
        Row label; the explained row is ``m_input.loc[LOC]``.
    model
        Model handed to ``eli5.explain_prediction_df`` (defaults to the
        module-level ``bst_ex``).
    m_input
        Table indexed with ``.loc`` (defaults to the module-level
        ``proj_bas``).
    var_group
        Mapping of group name to a list of feature names.

    Returns
    -------
    dict
        Group name mapped to the sum of the explanation's ``weight`` column
        over the rows whose ``feature`` belongs to that group. Features that
        appear in no group do not contribute to any total.
    """
    explanation = eli5.explain_prediction_df(model, m_input.loc[LOC])
    return {
        group_name: explanation.loc[explanation['feature'].isin(members), 'weight'].sum()
        for group_name, members in var_group.items()
    }
# Example of a good company: group-level contributions for row label 55707
var_group_contribution(55707)
# Example of a bad company: group-level contributions for row label 25177
var_group_contribution(25177)
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import cufflinks as cf
# Set up plotly's offline notebook mode so that `iplot` can render figures inline.
init_notebook_mode()
# Put cufflinks into offline mode as well.
cf.go_offline()
def plot_var_group_contribution(LOC, model = bst_ex, m_input = proj_bas, var_group = var_group,
                                radial_range = (-1, 1)):
    """Draw a filled polar (radar) chart of per-group contributions for one row.

    Parameters
    ----------
    LOC
        Row label forwarded to ``var_group_contribution``.
    model, m_input, var_group
        Forwarded unchanged to ``var_group_contribution``.
    radial_range
        ``(low, high)`` limits of the radial axis. The default reproduces the
        previously hard-coded ``[-1, 1]``; pass a wider pair when group
        contributions fall outside that interval.

    Returns
    -------
    Whatever ``iplot`` returns for the constructed figure.
    """
    var_group_con = var_group_contribution(LOC, model, m_input, var_group)

    # Materialise the keys as a list: on Python 3 `dict.keys()` is a view that
    # supports neither `+` nor indexing, so the old
    # `keys() + [keys()[0]]` expression raised TypeError there.
    group_names = list(var_group_con)
    # Repeat the first group at the end so the outline closes on itself.
    # Slicing with [:1] yields [] for an empty mapping instead of raising.
    _theta = group_names + group_names[:1]

    data = [go.Scatterpolar(
        theta = _theta,
        r = [var_group_con[k] for k in _theta],
        fill = 'toself'
    )]
    layout = go.Layout(
        polar = dict(
            radialaxis = dict(
                visible = True,
                range = list(radial_range)
            )
        ),
        showlegend = False
    )
    fig = go.Figure(data=data, layout=layout)
    return iplot(fig)
# Example of a good company: radar chart of group contributions for row label 55707
plot_var_group_contribution(55707)
# Example of a bad company: radar chart of group contributions for row label 25177
plot_var_group_contribution(25177)